package Q17_26_Sparse_Similarity;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map.Entry;
import java.util.Set;
import CtCILibrary.AssortedMethods;
import CtCILibrary.HashMapList;
public class QuestionB {
public static HashMap<DocPair, Double> computeSimilarities(HashMap<Integer, Document> documents) {
HashMapList<Integer, Integer> wordToDocs = groupWords(documents);
HashMap<DocPair, Double> similarities = computeIntersections(wordToDocs);
adjustToSimilarities(documents, similarities);
return similarities;
}
/* Create hash table from each word to where it appears. */
public static HashMapList<Integer, Integer> groupWords(HashMap<Integer, Document> documents) {
HashMapList<Integer, Integer> wordToDocs = new HashMapList<Integer, Integer>();
for (Document doc : documents.values()) {
ArrayList<Integer> words = doc.getWords();
for (int word : words) {
wordToDocs.put(word, doc.getId());
}
}
return wordToDocs;
}
/* Compute intersections of documents. Iterate through each list of
* documents and then each pair within that list, incrementing the
* intersection of each page. */
public static HashMap<DocPair, Double> computeIntersections(HashMapList<Integer, Integer> wordToDocs) {
HashMap<DocPair, Double> similarities = new HashMap<DocPair, Double>();
Set<Integer> words = wordToDocs.keySet();
for (int word : words) {
ArrayList<Integer> docs = wordToDocs.get(word);
Collections.sort(docs);
for (int i = 0; i < docs.size(); i++) {
for (int j = i + 1; j < docs.size(); j++) {
increment(similarities, docs.get(i), docs.get(j));
}
}
}
return similarities;
}
/* Increment the intersection size of each document pair. */
public static void increment(HashMap<DocPair, Double> similarities, int doc1, int doc2) {
DocPair pair = new DocPair(doc1, doc2);
if (!similarities.containsKey(pair)) {
similarities.put(pair, 1.0);
} else {
similarities.put(pair, similarities.get(pair) + 1);
}
}
/* Adjust the intersection value to become the similarity. */
public static void adjustToSimilarities(HashMap<Integer, Document> documents, HashMap<DocPair, Double> similarities) {
for (Entry<DocPair, Double> entry : similarities.entrySet()) {
DocPair pair = entry.getKey();
Double intersection = entry.getValue();
Document doc1 = documents.get(pair.doc1);
Document doc2 = documents.get(pair.doc2);
double union = (double) doc1.size() + doc2.size() - intersection;
entry.setValue(intersection / union);
}
}
public static void main(String[] args) {
int numDocuments = 10;
int docSize = 5;
HashMap<Integer, Document> documents = new HashMap<Integer, Document>();
for (int i = 0; i < numDocuments; i++) {
int[] words = AssortedMethods.randomArray(docSize, 0, 10);
ArrayList<Integer> w = Tester.removeDups(words);
System.out.println(i + ": " + w.toString());
Document doc = new Document(i, w);
documents.put(i, doc);
}
HashMap<DocPair, Double> similarities = computeSimilarities(documents);
Tester.printSim(similarities);
}
}